##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 3: Results reported in the paper
##################################################################
##################################################################

# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalima 10.15.5

# load packages required to run this script
library(estimatr)  # CRAN v0.22.0
library(broom)     # CRAN v0.5.5
library(cowplot)   # CRAN v1.0.0
library(ggeffects) # CRAN v0.14.2
library(dplyr)     # CRAN v1.0.0
library(tidyr)     # CRAN v1.1.0
library(ggplot2)   # CRAN v3.3.2
library(car)       # CRAN v3.0-7
library(cowplot)   # CRAN v1.0.0
library(Zelig)     # CRAN v5.1.6.1
library(scales)    # CRAN v1.1.1

# create custom ggplot2 scheme

theme_baser <- function (){
    theme_minimal()  %+replace%
        theme(panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.major.y = element_blank(),
              panel.border = element_rect(fill = NA,colour = "black", size = 0.5,
                                          linetype = "solid"),
              legend.title = element_text(size = 15),
              plot.title = element_text(size = 15, face = "italic",
                                        vjust = 1.5, hjust = 0,
                                        margin=margin(0, 0, 12 ,0)),
              legend.position = "bottom",
              axis.ticks = element_line(size = 0.3),
              axis.ticks.length = unit(0.2, "cm"),
              legend.text=element_text(size = 13),
              strip.text = element_text(size = 15, hjust = 0.5,
                                        margin = margin(b = 5, r = 5, l = 5, t = 5)),
              axis.text = element_text(colour = "black", size = 13),
              axis.title = element_text(size = 13, hjust = 0.5))
}

# set theme
theme_set(theme_baser())

# load classified manifestos (created in previous script)

dat_combined_all <- readRDS("data_manifestos_classified.rds")

length(unique(dat_combined_all$manifesto_id))

# check how many long sentences and in which manifestos

dat_combined_long <- dat_combined_all %>% 
    group_by(manifesto_id, countryname, party, partyname, date) %>% 
    summarise(mean_ntoken = mean(ntoken),
              max_ntoken = max(ntoken),
              median_ntoken = median(ntoken),
              min_tnoken = min(ntoken), 
              sum_ntoken = sum(ntoken)) %>% 
    arrange(-max_ntoken)


# remove all sentences with more than 99 tokens
dat_combined <- filter(dat_combined_all, ntoken < 100)

# get overview of proportions of each class (reported in the paper)

prop.table(table(dat_combined$class))

dat_combined$year <- as.numeric(dat_combined$year)

length(unique(dat_combined$election_id))

nrow(dat_combined)
nrow(dat_combined_all) - nrow(dat_combined)

# removing sentences longer 99 words reduces the corpus by...
(nrow(dat_combined_all) - nrow(dat_combined)) / nrow(dat_combined_all) * 100
# per cent

# check what proportions of sentences is annotated
nrow(filter(dat_combined, annotations == "TRUE")) / nrow(dat_combined) * 100

nrow(dat_combined)

table(dat_combined$annotations)

# calculate absolute number of words and proportions per class and manifesto
dat_combined_props <- dat_combined %>% 
    group_by(countryname, partyname, party_family, language, date, incumbency_status2_factor,
             manifesto_id, extremist_party, gdp_growth_lag, 
             rile, year, class) %>%
    summarise(n_sentences = n(),
              n_words = sum(ntoken)) %>%
    mutate(relfreq_sentences = n_sentences / sum(n_sentences),
           relfreq_words  = n_words / sum(n_words))

# calculate the proportion of future to past sentences (future) / (future + past)
dat_combined_props_past_future <- dat_combined %>% 
    group_by(countryname, partyname, party_family, language, date, incumbency_status2_factor,
             manifesto_id, extremist_party, gdp_growth_lag, 
             rile, year, class) %>%
    summarise(n_sentences = n(),
              n_words = sum(ntoken)) %>%
    spread(class, n_sentences) %>% 
    group_by(manifesto_id) %>% 
    mutate(prop_past_future = sum(Future, na.rm = TRUE) / (sum(Future, na.rm = TRUE) + sum(Past, na.rm = TRUE)))

dat_combined_props_past_future <- dat_combined_props_past_future %>% 
    select(countryname, partyname, party_family, language, date, incumbency_status2_factor,
           manifesto_id, extremist_party, gdp_growth_lag, 
           rile, year, prop_past_future) %>%
    unique() %>% 
    mutate(election_id = paste0(countryname, date))


dat_combined_props_past_future$incumbency_status2_factor <- factor(
    dat_combined_props_past_future$incumbency_status2_factor,
    levels = c("Incumbent", "Opposition")
)



# filter manifestos with fewer than three classes (past is missing)
dat_combined_props_2 <- dat_combined_props %>% 
    group_by(manifesto_id) %>% 
    mutate(n_classes = n()) %>% 
    filter(n_classes < 3) %>%
    select(countryname:year) %>% 
    unique()

# change past information as 0 for these manifestos
dat_combined_props_2_past <- dat_combined_props_2 %>% 
    mutate(class = "Past",
           relfreq_sentences = 0,
           relfreq_words = 0,
           n_sentences = 0,
           n_words = 0)


nrow(dat_combined_props_2)

# bind the two data frames
dat_combined_props <- bind_rows(dat_combined_props, 
                                dat_combined_props_2_past)


# descriptive statistics for paper
dat_combined_props %>% 
    group_by(class) %>% 
    summarise(mean = mean(relfreq_sentences), 
              median = median(relfreq_sentences),
              max = max(relfreq_sentences),
              min = min(relfreq_sentences))

# relevel the class variable
dat_combined_props$class <- factor(dat_combined_props$class,
                                   levels = c("Future", "Present", "Past"))

# Figure 1 ----
ggplot(dat_combined_props, aes(x = class, 
                               y = relfreq_sentences,
                               colour = incumbency_status2_factor)) +
    geom_boxplot(position = position_dodge(width = 0.6),
                 width = 0.5) +
    scale_colour_manual(values = c("black", "red")) +
    facet_wrap(~countryname, scales = "free_x") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                       breaks = c(seq(0, 1, 0.25)), limits = c(0, 1)) +
    labs(x = NULL, y = "Percentage of temporal emphasis in party manifestos") +
    theme(legend.title = element_blank(),
          legend.position = "bottom")
ggsave("fg1.pdf",
       width = 9, height = 6)
ggsave("fg1.eps",
       width = 9, height = 6)



# formulas for sentiment
sent_log <- function(positive, negative, neg_negative, neg_positive, 
                     negations = FALSE) {
    if (negations == FALSE) {
        log((sum(positive) + 0.5) / (sum(negative) + 0.5))
    }
    
    else {
        log((sum(positive) + sum(neg_negative, na.rm = TRUE) + 0.5) / (sum(negative) + sum(neg_positive, na.rm = TRUE) + 0.5))
        
    }
}


sent_crabtreeetal <- function(positive, negative, neg_negative, neg_positive,
                              ntoken = ntoken, negations = FALSE) {
    
    if (negations == FALSE) {
        100 * (sum(positive) - sum(negative)) / sum(ntoken)
    }
    
    else {
        100 * (sum(positive) + sum(neg_negative, na.rm = TRUE) - sum(negative) - sum(neg_positive, na.rm = TRUE)) / sum(ntoken)
    }
    
}

# estimate sentiment on the level of manifesto-classes and for the entire manifesto
dat_combined_class3 <- dat_combined %>% 
    group_by(manifesto_id, class) %>% 
    mutate(n_sentences_class = n(),
           n_words_class = sum(ntoken, na.rm = TRUE)) %>% 
    group_by(manifesto_id) %>% 
    mutate(n_sentences_manifesto =  n(),
           n_words_manifesto = sum(ntoken, na.rm = TRUE)) %>% 
    ungroup() %>% 
    group_by(manifesto_id, class) %>% 
    mutate(sent_liwc_log = sent_log(positive_liwc, negative_liwc)) %>% 
    mutate(sent_lsd_log = sent_log(positive_lsd, negative_lsd, 
                                   neg_negative = neg_negative_lsd,
                                   neg_positive = neg_positive_lsd,
                                   negations = TRUE)) %>% 
    mutate(sent_rauh_log = sent_log(positive_rauh, negative_rauh,
                                    neg_negative = neg_negative_rauh,
                                    neg_positive = neg_positive_rauh)) %>% 
    mutate(sent_liwc15_log = sent_log(positive_liwc15, negative_liwc15)) %>% 
    mutate(sent_liwc_crabtreeetal = sent_crabtreeetal(positive_liwc, negative_liwc,
                                                      ntoken = ntoken)) %>% 
    mutate(sent_liwc15_crabtreeetal = sent_crabtreeetal(positive_liwc15, negative_liwc15,
                                                        ntoken = ntoken)) %>% 
    mutate(sent_lsd_crabtreeetal = sent_crabtreeetal(positive_lsd, negative_lsd,
                                                     neg_negative = neg_negative_lsd,
                                                     neg_positive = neg_positive_lsd,
                                                     ntoken = ntoken)) %>% 
    mutate(sent_rauh_crabtreeetal = sent_crabtreeetal(positive_rauh, negative_rauh,
                                                      neg_negative = neg_negative_rauh,
                                                      neg_positive = neg_positive_rauh,
                                                      ntoken = ntoken))


# aggregate manifestos to 3 observations (one per class), 
# select relevant variables, and keep only one observation per manifesto-class
dat_aggregated_class3 <- dat_combined_class3 %>%
    ungroup() %>% 
    select(countryname, year, language,
           party, partyname, 
           party_name, 
           extremist_party,
           class,
           edate, starts_with("n_"),
           manifesto_id,
           election_date, election_id,
           seat_share_cmp_lag,
           starts_with("incumbency_"),
           party_family,
           rile,
           starts_with("sent_"),
           starts_with("gdp"),
           starts_with("inflation"),
           starts_with("unemployment")) %>% 
    unique()


# factors for country and extremist party
dat_aggregated_class3$countryname <- factor(dat_aggregated_class3$countryname)

dat_aggregated_class3$class <- factor(dat_aggregated_class3$class)


dat_aggregated_class3$extremist_party <- factor(dat_aggregated_class3$extremist_party,
                                                levels = c("Not extremist party", "Extremist party"))
# square Rile scores
dat_aggregated_class3$rile_squared <- dat_aggregated_class3$rile^2



# run regression used to calculate expected values and construct Figure 2a

lm_sent_liwc_extended_gdp <- lm_robust(sent_liwc_crabtreeetal ~ 
                                           incumbency_status2_factor * class + countryname +
                                           poly(rile, 2) + extremist_party + year + gdp_growth_lag,
                                       data = dat_aggregated_class3,
                                       cluster = manifesto_id)


pred_sent_inc2 <- ggpredict(lm_sent_liwc_extended_gdp, 
                            terms = c("class", 
                                      "incumbency_status2_factor"))

pred_sent_inc2 <- pred_sent_inc2 %>% 
    mutate(tense = car::recode(x, "'1'='Past';'2'='Present';'3'='Future'"))

pred_sent_inc2$tense <- factor(pred_sent_inc2$tense, 
                               levels = c("Past", "Present", "Future"))


# Figure 2a ----
plot_fig2a <- ggplot(pred_sent_inc2, aes(x = tense, 
                           y = predicted,
                           ymin = conf.low,
                           ymax = conf.high, 
                           colour = group,
                           shape = group)) +
    geom_point(position = position_dodge(width = 0.3), size = 3.5) +
    geom_linerange(position = position_dodge(width = 0.3), size = 0.6) +
    geom_linerange(aes(ymin = predicted - 1.645 * std.error,
                       ymax = predicted + 1.645 * std.error), 
                   position = position_dodge(width = 0.3), 
                   size = 1.3) +
    scale_shape_manual(values = c(17, 16)) +
    scale_y_continuous(limits = c(-0.5, 4), breaks = c(seq(0, 4, 1))) +
    scale_colour_manual(values = c("red", "black")) +
    labs(x = "Temporal focus", y = "Sentiment",
         title = "(a) Expected values of sentiment") +
    theme(legend.title = element_blank(),
          legend.position = c(0.8, 0.2),
          legend.background = element_rect(colour = 'grey60', fill = 'white', linetype='solid'),
          legend.text = element_text(size = 10),
          axis.title.x = element_text(size = 15))


# run Zelig models for first difference simulations
lm_zelig <- zelig(sent_liwc_crabtreeetal ~ 
                      incumbency_status2_factor * class +
                      countryname +
                      rile + rile_squared + extremist_party +
                      year + gdp_growth_lag,
                  model= "ls",
                  data = dat_aggregated_class3)


# set explanatory variable values
x_past_inc <- setx(lm_zelig, incumbency_status2_factor = "Incumbent", 
                   class = "Past")

x_past_opp <- setx(lm_zelig, incumbency_status2_factor = "Opposition", 
                   class = "Past")

x_present_inc <- setx(lm_zelig, incumbency_status2_factor = "Incumbent", 
                      class = "Present")

x_present_opp <- setx(lm_zelig, incumbency_status2_factor = "Opposition", 
                      class = "Present")

x_future_inc <- setx(lm_zelig, incumbency_status2_factor = "Incumbent", 
                     class = "Future")

x_future_opp <- setx(lm_zelig, incumbency_status2_factor = "Opposition", 
                     class = "Future")


set.seed(34)
lm_zelig_sim_past <- sim(lm_zelig, x = x_past_opp, x1 = x_past_inc, num = 1000)

set.seed(34)
lm_zelig_sim_present <- sim(lm_zelig, x = x_present_opp, x1 = x_present_inc, num = 1000)

set.seed(34)
lm_zelig_sim_future <- sim(lm_zelig, x = x_future_opp, x1 = x_future_inc, num = 1000)

# get first difference values
fd_present <- unlist(lm_zelig_sim_present$sim.out[["x1"]][["fd"]])
fd_future <- unlist(lm_zelig_sim_future$sim.out[["x1"]][["fd"]])
fd_past <- unlist(lm_zelig_sim_past$sim.out[["x1"]][["fd"]])

fd_future_dataframe <- data.frame(fd = fd_future,
                                  class = "Future")
fd_past_dataframe <- data.frame(fd = fd_past,
                                class = "Past")
fd_present_dataframe <- data.frame(fd = fd_present,
                                   class = "Present")


# calculate percentage of simulations with first differences
# exceeding 0

fd_smaller_0 <- filter(fd_future_dataframe, fd < 0) %>% nrow()

100 * ((1000 - fd_smaller_0) / 1000)

# summarise the simulations
dat_fd_sum_future <- data.frame(
    mean = mean(fd_future),
    sd = sd(fd_future),
    ci_95_low  = quantile(fd_future, .025),
    ci_95_high = quantile(fd_future, 0.975),
    ci_90_low  = quantile(fd_future, .05),
    ci_90_high = quantile(fd_future, 0.95),
    class = "Future"
)

dat_fd_sum_past <- data.frame(
    mean = mean(fd_past),
    sd = sd(fd_past),
    ci_95_low  = quantile(fd_past, .025),
    ci_95_high = quantile(fd_past, 0.975),
    ci_90_low  = quantile(fd_past, .05),
    ci_90_high = quantile(fd_past, 0.95),
    class = "Past"
)


dat_fd_sum_present <- data.frame(
    mean = mean(fd_present),
    sd = sd(fd_present),
    ci_95_low  = quantile(fd_present, .025),
    ci_95_high = quantile(fd_present, 0.975),
    ci_90_low  = quantile(fd_present, .05),
    ci_90_high = quantile(fd_present, 0.95),
    class = "Present"
)

# bind the summarised first differences into one single dataframe
dat_fd_sum_merged <- bind_rows(dat_fd_sum_present,
                               dat_fd_sum_past,
                               dat_fd_sum_future)

# check how much larger the difference in past is compared to the future
dat_fd_sum_past$mean / dat_fd_sum_future$mean 

dat_fd_sum_past$mean
dat_fd_sum_future$mean 

dat_fd_sum_merged$class <- factor(dat_fd_sum_merged$class,
                                    levels = c("Past", "Present", "Future"))

# bind all simulations into one data frame
fd_dataframe_merged <- bind_rows(fd_future_dataframe,
                                 fd_past_dataframe,
                                 fd_present_dataframe)

fd_dataframe_merged$class <- factor(fd_dataframe_merged$class,
                                    levels = c("Past", "Present", "Future"))

# Figure 2b ----
plot_fig2b <- ggplot(fd_dataframe_merged, aes(x = fd)) + 
    geom_density(fill = "grey75", colour = NA) + 
    geom_rug(colour = "grey20") +
    facet_grid(class ~., scales = "free", space = "free") +
    scale_x_continuous(limits = c(-0.3, 1.6), breaks = c(seq(-0.25, 1.5, 0.25))) +
    scale_y_continuous(breaks= c(0, 1, 2)) +
    geom_vline(xintercept = 0, linetype = "dashed", colour = "red") + 
    geom_vline(data = dat_fd_sum_merged, 
               mapping = aes(xintercept = mean), colour = "grey40") +
    labs(x = "First difference", 
         y = NULL,
         title = "(b) Distribution of simulated first differences") +
    theme(strip.text.y = element_text(angle = 0),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.x = element_text(size = 15))
plot_fig2b


# merge plot_fig2a and plot_fig2b using the cowplot package

plot_grid(plot_fig2a, plot_fig2b, ncol = 2)
ggsave("fg2.pdf",
       width = 10, height = 4)
ggsave("fg2.eps",
       width = 10, height = 4)

